##################################################################################################################
##********* Moukouba Moutoumounkata, July 2020 *************** ##
## Global Land Average Temperature Time Series Forecasting ##
## ******************************************************* ##
##################################################################################################################
#we will try to build a reproducible experiment, although without
#guarantee, due to the extremely random nature of Neural Networks
# Seed NumPy and TensorFlow RNGs for best-effort reproducibility.
from numpy.random import seed
seed(1)
# `tensorflow.set_random_seed` was removed in TF 2.x; the TF2 API is
# tf.random.set_seed (this file otherwise targets TF2 via tensorflow.keras).
import tensorflow as tf
tf.random.set_seed(2)
import numpy as np
import pandas as pd
from operator import itemgetter
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objs as go
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
# Importing the Keras libraries and packages from TensorFlow
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Input, Dense, Dropout, LSTM, Flatten,
GRU, Bidirectional, TimeDistributed)
from tensorflow.keras.activations import relu, linear
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.models import model_from_json
from tensorflow.keras import backend as K
import tensorflow as tf
import talos
from talos import scan, Evaluate, Reporting
from talos.utils import early_stopper, lr_normalizer
from talos.utils.gpu_utils import parallel_gpu_jobs
from talos.utils.recover_best_model import recover_best_model
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Load the global land temperatures dataset
# (assumes lower-cased headers: dt, landaveragetemperature, ... — confirm against the CSV)
df = pd.read_csv("/home/moukouba/data_science/python/datasets/globaltemperatures.csv")
df.head()
df.shape
# Parse the date column. The vectorized pd.to_datetime call is equivalent to the
# original element-wise df.dt.apply(pd.to_datetime) but far faster; unparseable
# entries become NaT thanks to errors='coerce'.
df["dt"] = pd.to_datetime(df["dt"], errors='coerce')
# Average-temperature frame (NaNs imputed later) and min-temperature frame,
# both indexed by date
df1 = df[["dt", "landaveragetemperature", "landaveragetemperatureuncertainty"]].set_index("dt")
df2 = df[["dt", "landmintemperature", "landmintemperatureuncertainty"]].set_index("dt")
# Inspect how many values are missing in each frame
print(df1.isnull().sum())
print(df2.isnull().sum())
df1.head(15)
df1.shape
# The min-temperature frame keeps only complete rows
df2 = df2.dropna()
df2.head()
#We define a function that replaces the missing entries with the long term mean of that specific day of the year
def imputer(df):
    """Fill missing values with the long-term mean for the same day of year.

    Each NaN cell is replaced by the mean (over all years) of the same column
    on the same month-day. Assumes ``df`` has a DatetimeIndex (it is built via
    ``set_index("dt")`` on parsed dates).

    Fixes over the original nested-loop version:
    - only the cells that are actually NaN are filled; valid values sharing a
      row with a NaN are no longer overwritten by the group mean;
    - month-day matching is exact, not a substring test (``"01-02" in
      str(index)`` falsely matched dates like 2001-02-03);
    - one grouped pass instead of an O(n^2) scan.

    Returns a new DataFrame; the caller reassigns (``df1 = imputer(df1)``).
    """
    # Group key: "MM-DD" for every row, so means are taken across years.
    day_of_year = df.index.strftime('%m-%d')
    # transform('mean') skips NaNs and returns a frame aligned with df,
    # so fillna fills exactly the missing cells.
    return df.fillna(df.groupby(day_of_year).transform('mean'))
# Apply the day-of-year mean imputation to the average-temperature frame
df1 = imputer(df1)
df1.head(15)
#x1, x2, x3, x4, x5 = itemgetter(0, 1, 2, 3, 4)(np.array_split(X, 5))
# Plot the average-temperature series in three chunks so long-term trends stay readable
x = np.array_split(df1, 3)
for data in x:
    fig = px.line(data["landaveragetemperature"])
    fig.show()
# Measurement uncertainty of the average temperature over the whole period
fig = px.line(df1["landaveragetemperatureuncertainty"])
fig.show()
#z1, z2, z3, z4, z5 = itemgetter(0, 1, 2, 3, 4)(np.array_split(Z, 5))
# Same chunked view for the min-temperature series
z = np.array_split(df2, 3)
for data in z:
    fig = px.line(data["landmintemperature"])
    fig.show()
#Plotting boxplots to check for outliers
trace0 = go.Box(y=df2["landmintemperature"], name="Min_temp")
trace1 = go.Box(y=df1["landaveragetemperature"], name="Avg_temp")
sequences = [trace0, trace1]
fig = go.Figure(sequences)
fig.show()
#Subsetting one month, say February, and plot the boxplots
df3 = df1[df1.index.month == 2]
df4 = df2[df2.index.month == 2]
trace0 = go.Box(y=df4["landmintemperature"], name="Min_temp")
trace1 = go.Box(y=df3["landaveragetemperature"], name="Avg_temp")
sequences = [trace0, trace1]
fig = go.Figure(sequences)
fig.show()
#We define a function that transforms the data to the required shape
def convert2matrix(data_arr, n_steps):
    """Turn a 1-D series into (window, next-value) supervised pairs.

    Returns ``X`` of shape ``(len(data_arr) - n_steps, n_steps)`` holding the
    sliding look-back windows, and ``y`` holding the one-step-ahead target
    that immediately follows each window.
    """
    n_samples = len(data_arr) - n_steps
    windows = [data_arr[start:start + n_steps] for start in range(n_samples)]
    targets = [data_arr[start + n_steps] for start in range(n_samples)]
    return np.array(windows), np.array(targets)
#We define a chronological splitter: first fraction vs the remainder (no shuffling)
def data_splitter(df, train_fraction):
    """Split ``df`` at ``int(len(df) * train_fraction)`` and return both parts."""
    cut = int(len(df) * train_fraction)
    return df[:cut], df[cut:]
#Split the data into train, validation and test series
series = df1["landaveragetemperature"].values
# FIX: the original took val = last 25% and test = last 15% of the SAME series,
# so the validation set overlapped (contained) the test set — data leakage.
# Split chronologically into disjoint pieces instead:
# train ~75%, val ~10%, test = last 15%.
train_val, test = data_splitter(series, 0.85)
train, val = data_splitter(train_val, 0.75 / 0.85)
print(len(train), len(val), len(test))
#Convert dataset into right shape to turn the problem into a supervised...
#... learning one. We choose n_steps = 36, thus, we look back up to...
#... 3 years (36) consecutive months to be able to forecast the next month
n_steps = 36
X_train, y_train = convert2matrix(train, n_steps)
X_val, y_val = convert2matrix(val, n_steps)
X_test, y_test = convert2matrix(test, n_steps)
#Scale the data: fit on the training windows only, then apply to val/test
scaler = MinMaxScaler(feature_range=(0, 1))
x_train = scaler.fit_transform(X_train)
x_val = scaler.transform(X_val)
x_test = scaler.transform(X_test)
# reshape input to be [samples, time steps, features]
x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1]))
x_val = np.reshape(x_val, (x_val.shape[0], 1, x_val.shape[1]))
x_test = np.reshape(x_test, (x_test.shape[0], 1, x_test.shape[1]))
# Sanity check: a few raw test windows next to their targets
X_test.shape
for i in range(5):
    print(X_test[i], y_test[i])
# Setting the dictionary of the hyperparameter to be included in the optimisation process.
# Per Talos convention, a 3-tuple (min, max, steps) is expanded into `steps`
# evenly spaced values; a list enumerates discrete choices directly.
p = {'epochs': (20, 150, 5),
     'neurons1': [ 32, 64, 128, 256],   # units in the bidirectional GRU layer
     'neurons2': [32, 64, 128, 256],    # units in the second GRU layer
     'dropout': (0.1, 0.6, 5),
     'loss': ['mse', 'mae'],
     'activation1':[relu, None,],       # GRU activation (None falls back to Keras default)
     'activation2':[linear, None,],     # output-layer activation
     'batch_size': [8, 16, 32, 64, 128],
     'optimizer': ['Adam', 'SGD', 'RMSprop']
     }
#Custom Keras metric: coefficient of determination (R-squared)
def r_squared(y_true, y_pred):
    """Compute R^2 = 1 - SS_res/SS_tot on backend tensors.

    K.epsilon() guards against division by zero when the targets in a
    batch are constant (SS_tot == 0).
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    total_ss = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - residual_ss / (total_ss + K.epsilon())
#We define a model builder function. We wrap the first hidden layer into a Bidirectional layer
def model_builder(x_train, y_train, x_val, y_val, params, n_steps=36, n_features=1):
    """Build and train one Bidirectional-GRU model for a Talos trial.

    Parameters
    ----------
    x_train, y_train : training windows/targets; x shaped (samples, n_features, n_steps)
    x_val, y_val     : validation windows/targets, monitored for early stopping
    params           : Talos-supplied hyperparameter dict (see ``p``)
    n_steps          : look-back window length in months
    n_features       : parallel series count (univariate here)

    Returns
    -------
    (history, model) — the pair Talos expects from a model function.
    """
    # Free graph/session state left over from the previous trial
    tf.keras.backend.clear_session()
    model = Sequential([
        # NOTE(review): input_shape is declared on the wrapped GRU rather than
        # on the Bidirectional wrapper itself — confirm it propagates in the
        # TF version in use.
        Bidirectional(GRU(params['neurons1'], return_sequences=True,
                          activation=params['activation1'],
                          input_shape=(n_features, n_steps))),
        GRU(params['neurons2'], activation=params['activation1']),
        Dropout(params['dropout']),
        Dense(1, activation=params['activation2'])
    ])
    model.compile(optimizer=params['optimizer'], loss=params['loss'],
                  metrics=['mae', r_squared])
    # FIX: validation_data must be a tuple (x, y), not a list, per Keras fit()
    history = model.fit(x_train, y_train, epochs=params['epochs'],
                        batch_size=params['batch_size'], verbose=0,
                        validation_data=(x_val, y_val),
                        callbacks=[early_stopper(epochs=params['epochs'],
                                                 mode='moderate',
                                                 monitor='val_loss')])
    return history, model
# Cap each job at 50% of GPU memory so parallel trials can share one GPU
parallel_gpu_jobs(0.5)
# FIX: validate the hyperparameter search on the *validation* split. The
# original passed x_val=x_test/y_val=y_test, leaking the test set into model
# selection; the prepared x_val/y_val were never used.
scrutinizer = talos.Scan(x=x_train, y=y_train, x_val=x_val, y_val=y_val, seed=42,
                         model=model_builder, experiment_name='time_series__gru_hpo_01',
                         params=p, fraction_limit=0.001, reduction_metric='val_loss')
scrutinizer.details
analyze_object = talos.Analyze(scrutinizer)
analyze_object.data
# The highest R-squared achieved on the validation set
best_r_squared = analyze_object.high('val_r_squared')
# The lowest mean absolute error achieved on the validation set
best_mae = analyze_object.low('val_mean_absolute_error')
print("The scores for r_squared and mae, on validation set are %.5f and %.5f, respectively."%(best_r_squared, best_mae))
# The best models based on their respective metrics.
# R-squared is maximized (asc=False); MAE is minimized, so asc must be True —
# the original asc=False here picked the WORST model by MAE.
best1 = scrutinizer.best_model(metric='val_r_squared', asc=False)
best2 = scrutinizer.best_model(metric='val_mean_absolute_error', asc=True)
# Predicting the Test set results
y_pred_rsq = best1.predict(x_test)
y_pred_mae = best2.predict(x_test)
r2_1 = r2_score(y_test, y_pred_rsq)
r2_2 = r2_score(y_test, y_pred_mae)
print('R-squared for r_squared and mae -based metrics, on test set, are %.5f and %.5f, respectively.'%(r2_1, r2_2))
# Collect actual vs predicted test-set values for inspection and plotting
y0 = y_test.flatten()
y1 = y_pred_rsq.flatten()
y2 = y_pred_mae.flatten()
results = pd.DataFrame({"y_test":y0, "y_pred_rsq":y1, "y_pred_mae":y2})
results.tail(20)
# FIX: derive the x-axis length from the predictions themselves instead of the
# hard-coded -443, so this keeps working if the split fractions change
xpoints = df.iloc[-len(y0):, 0]
trace0 = go.Scatter(x=xpoints, y=y0, name='Actual Values')
trace1 = go.Scatter(x=xpoints, y=y1, name='Predicted (RSQ)')
trace2 = go.Scatter(x=xpoints, y=y2, name='Predicted (MAE)')
data = [trace0, trace1, trace2]
# (typo fix: axis label was "Temprature")
layout = go.Layout(title="Actual and Predicted Temperatures", xaxis={'title': 'Year'}, yaxis={'title': 'Temperature'})
fig = go.Figure(data=data, layout=layout)
fig.show()
# We can now save the best model for further use or deployment
# Index of the best experiment round, i.e. the one with the highest
# val_r_squared (the original comment said 'validation ROC_AUC', a metric
# from a classification template that is not computed in this experiment)
model_id = analyze_object.data[['val_r_squared']].idxmax()[0]
# Clear any previous TensorFlow session.
tf.keras.backend.clear_session()
# Rebuild the winning architecture from its stored JSON and restore its weights
model = model_from_json(scrutinizer.saved_models[model_id])
model.set_weights(scrutinizer.saved_weights[model_id])
model.summary()
# Persist to HDF5 for later reuse or deployment
model.save('./avg_temp_best_model.h5')